# Learning-based models (A skewed view into Neural Networks)

import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import Button, Output, VBox, Checkbox, Label
from IPython.display import display

# --------------------------
# 1) Generate synthetic data
# --------------------------
# 200 points drawn uniformly from the square [-5, 5]^2; the fixed seed
# makes the dataset reproducible across runs.
np.random.seed(42)
N = 200
X = np.random.uniform(-5, 5, (N, 2))
# ground-truth: class 1 if y > x, else 0
y_true = (X[:,1] > X[:,0]).astype(int)

# --------------------------
# 2) Precompute best line by brute force
# --------------------------
def compute_accuracy(m, c):
    """Score the line y = m*x + c as a linear classifier on the module data.

    A point is predicted class 1 when it lies strictly above the line.
    Returns (accuracy, predicted_labels).
    """
    above_line = X[:, 1] > (m * X[:, 0] + c)
    y_pred = above_line.astype(int)
    accuracy = np.mean(y_pred == y_true)
    return accuracy, y_pred

# search grid
# Brute-force over 100x100 (slope, intercept) pairs; keep the pair with
# the highest training accuracy (10^4 accuracy evaluations total).
ms = np.linspace(-2, 2, 100)
cs = np.linspace(-2, 2, 100)
best_acc = 0.0
best_m, best_c = 0.0, 0.0

for m in ms:
    for c in cs:
        acc, _ = compute_accuracy(m, c)
        if acc > best_acc:
            # strict '>' keeps the first maximizer encountered
            best_acc, best_m, best_c = acc, m, c

# --------------------------
# 3) State holders
# --------------------------
# Module-level state for the most recently drawn random line; all None
# until the first button click.
current_m = None
current_c = None
last_acc = None
last_y_pred = None

# --------------------------
# 4) Widgets
# --------------------------
out = Output()  # target area for the matplotlib figure
btn = Button(description="Generate Random Line", button_style='primary')
toggle = Checkbox(description="Show Best Line", value=True)
best_line_label = Label()  # textual readout of the brute-force best line

# --------------------------
# 5) Plotting routine
# --------------------------
def plot_decision_boundary(m, c, acc):
    """Render the data, the candidate line y = m*x + c, and (optionally)
    the precomputed best line inside the Output widget.

    Parameters:
        m, c: slope and intercept of the candidate line.
        acc:  its training accuracy, shown in the title.
    """
    with out:
        out.clear_output(wait=True)  # replace the previous frame in-place
        plt.figure(figsize=(7,7))
        # scatter by ground truth
        plt.scatter(X[y_true==0,0], X[y_true==0,1],
                    c='blue', label='Class 0 (true)')
        plt.scatter(X[y_true==1,0], X[y_true==1,1],
                    c='red',  label='Class 1 (true)')
        # random line
        xs = np.array([-5,5])
        plt.plot(xs, m*xs + c, 'k-', linewidth=2, label='Random Line')
        # best line? (controlled by the checkbox)
        if toggle.value:
            plt.plot(xs, best_m*xs + best_c, 'k--', linewidth=2,
                     label=f'Best Line (m={best_m:.2f}, c={best_c:.2f})')
            best_line_label.value = (
                f"Best line → slope = {best_m:.2f}, intercept = {best_c:.2f}"
            )
        else:
            best_line_label.value = ""
        plt.xlim(-5,5)
        plt.ylim(-5,5)
        plt.title(f"Accuracy: {acc*100:.2f}%")
        plt.legend(loc='lower left')
        plt.show()

# --------------------------
# 6) Button callback
# --------------------------
def on_button_clicked(_):
    """Sample a fresh random (slope, intercept), score it, and redraw."""
    global current_m, current_c, last_acc, last_y_pred
    # pick new random line
    current_m = np.random.uniform(-2,2)
    current_c = np.random.uniform(-2,2)
    last_acc, last_y_pred = compute_accuracy(current_m, current_c)
    plot_decision_boundary(current_m, current_c, last_acc)

btn.on_click(on_button_clicked)

# --------------------------
# 7) Toggle callback (redraw same line)
# --------------------------
def on_toggle_change(change):
    """Redraw the current random line when the checkbox value flips."""
    if change['name'] != 'value':
        return
    if current_m is None:
        return  # nothing drawn yet
    plot_decision_boundary(current_m, current_c, last_acc)

toggle.observe(on_toggle_change)

# --------------------------
# 8) Display
# --------------------------
display(VBox([btn, toggle, best_line_label, out]))

# initial draw: simulate one click so a line is visible immediately
on_button_clicked(None)
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

# --------------------------
# 1) Generate synthetic data
# --------------------------
# Same construction as above (different seed); labels are 1 above the
# line y = x and 0 below it.
np.random.seed(0)
N = 200
X = np.random.uniform(-5, 5, (N, 2))
y = (X[:, 1] > X[:, 0]).astype(int)  # true labels

# Add bias term to X
X_bias = np.hstack([np.ones((N, 1)), X])  # shape (N, 3)

# --------------------------
# 2) Logistic regression GD
# --------------------------
def sigmoid(z):
    """Elementwise logistic function 1 / (1 + e^(-z))."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

# Hyperparameters
lr = 0.1       # gradient-descent step size
n_iters = 100  # number of full-batch updates

# Initialize θ = [b, w1, w2]
theta = np.random.randn(3) * 0.5

# Store θ at each step
thetas = np.zeros((n_iters, 3))

# Full-batch gradient descent on the logistic (cross-entropy) loss;
# the gradient is the standard X^T (p - y) / N expression.
for i in range(n_iters):
    z = X_bias.dot(theta)
    y_pred = sigmoid(z)
    grad = (X_bias.T.dot(y_pred - y)) / N
    theta -= lr * grad
    thetas[i] = theta.copy()  # snapshot for the animation

# --------------------------
# 3) Set up the figure
# --------------------------
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)

# scatter ground-truth
ax.scatter(X[y == 0, 0], X[y == 0, 1], c='blue', label='Class 0')
ax.scatter(X[y == 1, 0], X[y == 1, 1], c='red',  label='Class 1')

# line that will be updated each animation frame
line, = ax.plot([], [], 'k-', linewidth=2)
# title drawn as an axes text artist so blitting can update it
title = ax.text(0.5, 1.03, "", transform=ax.transAxes, ha='center')

ax.legend(loc='lower right')

# --------------------------
# 4) Animation functions
# --------------------------
def init():
    """Reset the animated artists to an empty state before frame 0."""
    blank = []
    line.set_data(blank, blank)
    title.set_text("")
    return line, title

def update(frame):
    """Draw the decision boundary for gradient-descent step `frame`."""
    bias, w_x, w_y = thetas[frame]
    # boundary: bias + w_x*x + w_y*y = 0  →  y = -(bias + w_x*x) / w_y
    x_ends = np.array([-5, 5])
    y_ends = -(bias + w_x * x_ends) / w_y
    line.set_data(x_ends, y_ends)

    # training accuracy of the current parameters
    hard_preds = sigmoid(X_bias.dot(thetas[frame])) > 0.5
    acc = np.mean(hard_preds == y)

    title.set_text(f"Iter {frame+1}/{n_iters}   Accuracy: {acc*100:.1f}%")
    return line, title

anim = animation.FuncAnimation(
    fig, update, frames=n_iters, init_func=init,
    blit=True, interval=200, repeat=False
)

# --------------------------
# 5) Display in Jupyter
# --------------------------
plt.close(fig)       # hide extra static plot
HTML(anim.to_jshtml())  # render as an interactive JS animation
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML, display

# --------------------------
# 1) Generate “donut” data
# --------------------------
# Two concentric clusters built in polar coordinates: radii Gaussian
# around 1.0 (inner) and 2.5 (outer), angles uniform on [0, 2π).
np.random.seed(0)
N = 400

# inner circle (class 0)
r1     = np.random.randn(N//2) * 0.2 + 1.0
theta1 = np.random.rand(N//2) * 2 * np.pi
X1     = np.column_stack([r1 * np.cos(theta1),
                          r1 * np.sin(theta1)])
y1     = np.zeros(N//2, dtype=int)

# outer ring (class 1)
r2     = np.random.randn(N//2) * 0.2 + 2.5
theta2 = np.random.rand(N//2) * 2 * np.pi
X2     = np.column_stack([r2 * np.cos(theta2),
                          r2 * np.sin(theta2)])
y2     = np.ones(N//2, dtype=int)

# combine
X = np.vstack([X1, X2])   # shape (N,2)
y = np.hstack([y1, y2])   # shape (N,)

# --------------------------
# 2) Polynomial features (degree 2)
#    φ(x,y) = [1, x, y, x², x·y, y²]
# --------------------------
def poly_features(X):
    """Expand 2-D points into degree-2 polynomial features.

    Returns an (N, 6) array with columns [1, x, y, x², x·y, y²].
    """
    first = X[:, 0]
    second = X[:, 1]
    bias = np.ones(len(X))
    return np.column_stack([
        bias, first, second, first ** 2, first * second, second ** 2
    ])

# Design matrix in degree-2 feature space
X_poly = poly_features(X)  # shape (N,6)

# --------------------------
# 3) Logistic regression via gradient descent
# --------------------------
def sigmoid(z):
    """Logistic function applied elementwise: 1 / (1 + e^(-z))."""
    exp_neg = np.exp(-z)
    return 1 / (1 + exp_neg)

lr      = 0.1   # learning rate
n_iters = 100   # number of full-batch GD steps
m       = X_poly.shape[0]  # number of samples

theta = np.zeros(X_poly.shape[1])  # θ₀…θ₅
history = []  # one θ snapshot per iteration, for the animation

# Full-batch gradient descent on logistic loss in polynomial feature space.
for i in range(n_iters):
    z      = X_poly.dot(theta)      # shape (N,)
    pred   = sigmoid(z)
    grad   = (X_poly.T.dot(pred - y)) / m
    theta -= lr * grad
    history.append(theta.copy())

# --------------------------
# 4) Prepare grid for plotting
# --------------------------
# 200x200 lattice over the plot window; the decision boundary is drawn
# as the p = 0.5 contour of the model evaluated on this grid.
xs = np.linspace(-3, 3, 200)
ys = np.linspace(-3, 3, 200)
xx, yy = np.meshgrid(xs, ys)
grid   = np.column_stack([xx.ravel(), yy.ravel()])
grid_poly = poly_features(grid)   # shape (200*200,6)

# --------------------------
# 5) Set up figure
# --------------------------
fig, ax = plt.subplots(figsize=(6,6))
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)

# --------------------------
# 6) Animation update
# --------------------------
def update(frame):
    """Redraw the data and the p=0.5 decision contour for iteration `frame`."""
    ax.cla()  # full clear: contour artists are easiest to redraw from scratch
    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)
    # scatter by ground truth
    ax.scatter(X[y==0,0], X[y==0,1], c='blue', label='Class 0')
    ax.scatter(X[y==1,0], X[y==1,1], c='red',  label='Class 1')
    # compute prediction probabilities on grid
    θ = history[frame]
    Z = sigmoid(grid_poly.dot(θ)).reshape(xx.shape)
    # contour at p=0.5
    ax.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2)
    ax.set_title(f"Iteration {frame+1}/{n_iters}", pad=12)
    ax.legend(loc='upper right')

# --------------------------
# 7) Create animation
# --------------------------
anim = animation.FuncAnimation(
    fig, update,
    frames=n_iters,
    blit=False,  # blitting disabled because update() clears the axes
    interval=200,
    repeat=False
)

# --------------------------
# 8) Display in Jupyter
# --------------------------
plt.close(fig)  # suppress the extra static figure
display(HTML(anim.to_jshtml()))

# --------------------------
# 9) Print final boundary equation
# --------------------------
# Unpack the final θ and print the implicit conic-section boundary.
θ0, θ1, θ2, θ3, θ4, θ5 = theta
print(
    "Final decision boundary (p=0.5):\n"
    f"0 = {θ0:.3f} + {θ1:.3f}·x + {θ2:.3f}·y "
    f"+ {θ3:.3f}·x² + {θ4:.3f}·x·y + {θ5:.3f}·y²"
)
Final decision boundary (p=0.5):
0 = -1.508 + -0.069·x + -0.056·y + 0.641·x² + -0.081·x·y + 0.672·y²
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML, display

# --------------------------
# 1) Generate “donut” data
# --------------------------
# Same concentric-cluster construction as the previous cell, with
# Greek-letter angle variables.
np.random.seed(0)
N = 400

# inner circle (class 0)
r1     = np.random.randn(N//2) * 0.2 + 1.0
θ1     = np.random.rand(N//2) * 2 * np.pi
X1     = np.column_stack([r1 * np.cos(θ1), r1 * np.sin(θ1)])
y1     = np.zeros(N//2, dtype=int)

# outer ring (class 1)
r2     = np.random.randn(N//2) * 0.2 + 2.5
θ2     = np.random.rand(N//2) * 2 * np.pi
X2     = np.column_stack([r2 * np.cos(θ2), r2 * np.sin(θ2)])
y2     = np.ones(N//2, dtype=int)

# combine
X = np.vstack([X1, X2])   # shape (N,2)
y = np.hstack([y1, y2])   # shape (N,)

# --------------------------
# 2) Build a 3-layer MLP
#    input → hidden1(tanh) → hidden2(tanh) → output(sigmoid)
# --------------------------
def sigmoid(z):
    """Elementwise logistic function 1 / (1 + e^(-z))."""
    return 1.0 / (1.0 + np.exp(-z))

# network dimensions
D = 2    # input features
H1 = 16  # first hidden layer width
H2 = 8   # second hidden layer width

# initialize weights & biases (small random weights, zero biases)
np.random.seed(1)
W1 = np.random.randn(H1, D) * 0.1
b1 = np.zeros((H1, 1))
W2 = np.random.randn(H2, H1) * 0.1
b2 = np.zeros((H2, 1))
W3 = np.random.randn(1, H2) * 0.1
b3 = np.zeros((1, 1))

# training hyperparams
lr      = 0.1
n_iters = 200
params_history = []  # one parameter snapshot per iteration

# training loop (batch GD)
for i in range(n_iters):
    # forward
    Z1 = W1.dot(X.T) + b1        # (H1, N)
    A1 = np.tanh(Z1)
    Z2 = W2.dot(A1) + b2         # (H2, N)
    A2 = np.tanh(Z2)
    Z3 = W3.dot(A2) + b3         # (1, N)
    A3 = sigmoid(Z3)             # (1, N)

    # compute gradient of loss w.r.t. Z3
    # (sigmoid + cross-entropy shortcut: dL/dZ3 = A3 - y)
    m = N
    dZ3 = A3 - y.reshape(1, -1)  # (1, N)
    dW3 = (dZ3.dot(A2.T)) / m    # (1, H2)
    db3 = np.sum(dZ3, axis=1, keepdims=True) / m

    # backprop to layer2 (1 - A² is tanh'(Z))
    dA2 = W3.T.dot(dZ3)          # (H2, N)
    dZ2 = dA2 * (1 - A2**2)      # (H2, N)
    dW2 = (dZ2.dot(A1.T)) / m    # (H2, H1)
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m

    # backprop to layer1
    dA1 = W2.T.dot(dZ2)          # (H1, N)
    dZ1 = dA1 * (1 - A1**2)      # (H1, N)
    dW1 = (dZ1.dot(X)) / m       # (H1, D)
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m

    # gradient descent update
    W3 -= lr * dW3; b3 -= lr * db3
    W2 -= lr * dW2; b2 -= lr * db2
    W1 -= lr * dW1; b1 -= lr * db1

    # record params (copies, since the arrays are updated in place)
    params_history.append((W1.copy(), b1.copy(),
                           W2.copy(), b2.copy(),
                           W3.copy(), b3.copy()))

# --------------------------
# 3) Prepare a grid for boundary plotting
# --------------------------
# 200x200 lattice over the plot window; each frame evaluates the network
# on every grid point to draw the p = 0.5 contour.
xs = np.linspace(-3, 3, 200)
ys = np.linspace(-3, 3, 200)
xx, yy = np.meshgrid(xs, ys)
grid = np.column_stack([xx.ravel(), yy.ravel()])  # (40000, 2)

# --------------------------
# 4) Set up figure + scatter
# --------------------------
fig, ax = plt.subplots(figsize=(6,6))
ax.set_xlim(-3, 3); ax.set_ylim(-3, 3)
ax.scatter(X[y==0,0], X[y==0,1], c='blue', label='Class 0')
ax.scatter(X[y==1,0], X[y==1,1], c='red',  label='Class 1')
ax.legend(loc='upper right')
title = ax.text(0.5, 1.03, '', transform=ax.transAxes, ha='center')

# --------------------------
# 5) Animation update function
# --------------------------
def update(frame):
    """Redraw the data and the MLP's p=0.5 contour at training step `frame`."""
    ax.cla()
    ax.set_xlim(-3, 3); ax.set_ylim(-3, 3)
    # re-plot data
    ax.scatter(X[y==0,0], X[y==0,1], c='blue')
    ax.scatter(X[y==1,0], X[y==1,1], c='red')
    # get params recorded at this training iteration
    W1_i, b1_i, W2_i, b2_i, W3_i, b3_i = params_history[frame]
    # forward pass of the MLP on every grid point
    Z1g = W1_i.dot(grid.T) + b1_i
    A1g = np.tanh(Z1g)
    Z2g = W2_i.dot(A1g) + b2_i
    A2g = np.tanh(Z2g)
    Z3g = W3_i.dot(A2g) + b3_i
    Pg  = sigmoid(Z3g).reshape(xx.shape)
    # draw 0.5 contour
    ax.contour(xx, yy, Pg, levels=[0.5], colors='black', linewidths=2)
    ax.set_title(f"Iteration {frame+1}/{n_iters}")
    return []

# --------------------------
# 6) Create & display animation
# --------------------------
anim = animation.FuncAnimation(
    fig, update, frames=n_iters,
    blit=False, interval=100, repeat=False
)
plt.close(fig)  # suppress the extra static figure
display(HTML(anim.to_jshtml()))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# --------------------------
# 1) Load MNIST from sklearn
# --------------------------
# Downloads (or reads from the local cache) the 70k-image MNIST dataset
# via OpenML; network I/O on the first call.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist['data'], mnist['target'].astype(int)

# Normalize pixel values to [0,1]
X = X / 255.0

# --------------------------
# 2) Visualize some samples
# --------------------------
# Show 10 randomly chosen digits with their labels.
plt.figure(figsize=(10, 2))
indices = np.random.choice(X.shape[0], 10, replace=False)
for i, idx in enumerate(indices):
    plt.subplot(1, 10, i + 1)
    plt.imshow(X[idx].reshape(28, 28), cmap='gray')
    plt.axis('off')
    plt.title(y[idx])
plt.suptitle("Random MNIST Samples")
plt.show()

# --------------------------
# 3) Train / test split
# --------------------------
X_train, X_test, y_train_int, y_test_int = train_test_split(
    X, y, test_size=10000, random_state=42
)

# One‐hot encode labels (row k of the identity matrix encodes class k)
num_classes = 10
y_train = np.eye(num_classes)[y_train_int]
y_test  = np.eye(num_classes)[y_test_int]

# --------------------------
# 4) Define a 2‐layer NN
#    Input (784) → Hidden (128, ReLU) → Output (10, softmax)
# --------------------------
def relu(Z):
    """Rectified linear unit, elementwise: max(0, z)."""
    return np.maximum(0, Z)

def relu_deriv(Z):
    """Subgradient of ReLU: 1 where Z > 0, else 0."""
    positive = Z > 0
    return positive.astype(float)

def softmax(Z):
    """Row-wise softmax, shifted by the row max for numerical stability."""
    shifted = np.exp(Z - Z.max(axis=1, keepdims=True))
    return shifted / shifted.sum(axis=1, keepdims=True)

# Initialize weights with He initialization (sqrt(2/fan_in) scaling,
# suited to ReLU layers); seeded RandomState for reproducibility.
rng = np.random.RandomState(1)
input_size, hidden_size, output_size = 784, 128, 10
W1 = rng.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
b1 = np.zeros((1, hidden_size))
W2 = rng.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
b2 = np.zeros((1, output_size))

# --------------------------
# 5) Training hyperparameters
# --------------------------
lr = 0.1         # learning rate
epochs = 10      # full passes over the training set
batch_size = 64  # mini-batch size
n_train = X_train.shape[0]

# --------------------------
# 6) Training loop (mini‐batch GD)
# --------------------------
for epoch in range(epochs):
    # Shuffle training data
    perm = rng.permutation(n_train)
    X_train, y_train = X_train[perm], y_train[perm]
    
    for i in range(0, n_train, batch_size):
        X_batch = X_train[i : i + batch_size]
        y_batch = y_train[i : i + batch_size]
        
        # Forward pass
        Z1 = X_batch.dot(W1) + b1           # (batch, hidden_size)
        A1 = relu(Z1)
        Z2 = A1.dot(W2) + b2               # (batch, output_size)
        A2 = softmax(Z2)
        
        # Backward pass
        # (softmax + cross-entropy gradient simplifies to (A2 - y) / m)
        m = X_batch.shape[0]
        dZ2 = (A2 - y_batch) / m           # (batch, output_size)
        dW2 = A1.T.dot(dZ2)                # (hidden_size, output_size)
        db2 = dZ2.sum(axis=0, keepdims=True)
        
        dA1 = dZ2.dot(W2.T)                # (batch, hidden_size)
        dZ1 = dA1 * relu_deriv(Z1)
        dW1 = X_batch.T.dot(dZ1)           # (input_size, hidden_size)
        db1 = dZ1.sum(axis=0, keepdims=True)
        
        # Parameter update
        W2 -= lr * dW2
        b2 -= lr * db2
        W1 -= lr * dW1
        b1 -= lr * db1

    # Optionally print progress
    # train_preds = np.argmax(softmax(relu(X_train.dot(W1)+b1).dot(W2)+b2), axis=1)
    # train_acc = np.mean(train_preds == np.argmax(y_train, axis=1))
    # print(f"Epoch {epoch+1}/{epochs}: train acc {train_acc*100:.2f}%")

# --------------------------
# 7) Evaluate on test set
# --------------------------
# Single forward pass over the full test set; the predicted class is the
# argmax of the softmax output.
A1_test = relu(X_test.dot(W1) + b1)
A2_test = softmax(A1_test.dot(W2) + b2)
y_pred = np.argmax(A2_test, axis=1)
y_true = np.argmax(y_test, axis=1)

test_acc = np.mean(y_pred == y_true)
print(f"Test accuracy: {test_acc*100:.2f}%")
../_images/ecb3bf523f511d16169851777cad4b05df6824d780788c44bd164ea3695ff7b0.png
Test accuracy: 97.28%
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# --------------------------
# 1) Load MNIST and normalize
# --------------------------
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist['data'], mnist['target'].astype(int)
X = X / 255.0  # scale pixels to [0, 1]

# --------------------------
# 2) Show random samples before augmentation
# --------------------------
# 10 randomly chosen digits with their labels, prior to augmentation.
plt.figure(figsize=(10,2))
idxs = np.random.choice(len(X), 10, replace=False)
for i, idx in enumerate(idxs):
    plt.subplot(1, 10, i+1)
    plt.imshow(X[idx].reshape(28,28), cmap='gray')
    plt.axis('off')
    plt.title(y[idx])
plt.suptitle("Original MNIST Samples")
plt.show()

# --------------------------
# 3) Train/test split & one-hot encode
# --------------------------
X_train, X_test, y_train_int, y_test_int = train_test_split(
    X, y, test_size=10000, random_state=42
)
num_classes = 10
# identity-matrix row lookup converts integer labels to one-hot vectors
y_train = np.eye(num_classes)[y_train_int]
y_test  = np.eye(num_classes)[y_test_int]

# --------------------------
# 4) Define augmentation: shuffle 10% of pixels per image
# --------------------------
def augment_images(X, shuffle_frac=0.1, rng=None):
    """Shuffle a random subset of pixel values within each image.

    Parameters
    ----------
    X : ndarray of shape (n_images, n_pixels)
        Flattened images; the input is not modified in place.
    shuffle_frac : float, default 0.1
        Fraction of each image's pixels whose values are permuted
        among themselves (positions change, values are preserved).
    rng : numpy random Generator or RandomState, optional
        Source of randomness.  Defaults to the global ``np.random``
        module so existing callers keep their previous behavior;
        pass a seeded generator for reproducible augmentation.

    Returns
    -------
    ndarray
        Augmented copy of ``X`` with the same shape; each row contains
        exactly the same multiset of pixel values as the input row.
    """
    # Fall back to the legacy global RNG when none is supplied.
    random_state = np.random if rng is None else rng
    X_aug = np.empty_like(X)
    n_pixels = X.shape[1]
    n_shuffle = int(shuffle_frac * n_pixels)
    for i, img in enumerate(X):
        flat = img.copy()
        # pick distinct pixel positions, then permute their values in place
        idxs = random_state.choice(n_pixels, size=n_shuffle, replace=False)
        vals = flat[idxs]
        random_state.shuffle(vals)
        flat[idxs] = vals
        X_aug[i] = flat
    return X_aug

# Build the augmented copy of the training set (10% of pixels shuffled)
X_train_aug = augment_images(X_train, shuffle_frac=0.1)

# --------------------------
# 5) Show before & after augmentation
# --------------------------
# Top row: originals; bottom row: the same images after pixel shuffling.
plt.figure(figsize=(12, 4))
samples = np.random.choice(len(X_train), 10, replace=False)
for i, idx in enumerate(samples):
    # original
    plt.subplot(2, 10, i+1)
    plt.imshow(X_train[idx].reshape(28,28), cmap='gray')
    plt.axis('off')
    if i == 0:
        plt.ylabel("Original")
    # augmented
    plt.subplot(2, 10, 10 + i+1)
    plt.imshow(X_train_aug[idx].reshape(28,28), cmap='gray')
    plt.axis('off')
    if i == 0:
        plt.ylabel("Shuffled")
plt.suptitle("Before and After Pixel-Shuffle Augmentation")
plt.show()

# --------------------------
# 6) Define network & helpers
# --------------------------
def relu(Z):
    """Rectified linear unit, elementwise."""
    return np.maximum(0, Z)

def relu_deriv(Z):
    """Derivative of ReLU: 1 for positive inputs, 0 otherwise."""
    mask = Z > 0
    return mask.astype(float)

def softmax(Z):
    """Row-wise softmax with max-subtraction for numerical stability."""
    exp_shifted = np.exp(Z - Z.max(axis=1, keepdims=True))
    return exp_shifted / exp_shifted.sum(axis=1, keepdims=True)

def train_and_evaluate(X_tr, y_tr, X_te, y_te):
    """Train a fresh 784→128→10 ReLU/softmax network with mini-batch GD
    and return its accuracy on the test set.

    Parameters:
        X_tr, X_te: (n, 784) float arrays of flattened images.
        y_tr, y_te: (n, 10) one-hot encoded labels.

    Returns:
        Fraction of test samples classified correctly.

    The RNG is re-seeded inside the function, so both training runs
    (original vs. augmented data) start from identical weights.
    """
    rng = np.random.RandomState(1)
    # initialize weights (He initialization for the ReLU layer)
    input_size, hidden_size, output_size = 784, 128, 10
    W1 = rng.randn(input_size, hidden_size) * np.sqrt(2.0/input_size)
    b1 = np.zeros((1, hidden_size))
    W2 = rng.randn(hidden_size, output_size) * np.sqrt(2.0/hidden_size)
    b2 = np.zeros((1, output_size))
    
    # training hyperparams
    lr = 0.1
    epochs = 10
    batch_size = 64
    n_train = X_tr.shape[0]
    
    # training loop
    for epoch in range(epochs):
        perm = rng.permutation(n_train)
        X_shuf = X_tr[perm]
        y_shuf = y_tr[perm]
        for i in range(0, n_train, batch_size):
            X_batch = X_shuf[i:i+batch_size]
            y_batch = y_shuf[i:i+batch_size]
            # forward
            Z1 = X_batch.dot(W1) + b1
            A1 = relu(Z1)
            Z2 = A1.dot(W2) + b2
            A2 = softmax(Z2)
            # backward (softmax + cross-entropy gradient is (A2 - y) / m)
            m = X_batch.shape[0]
            dZ2 = (A2 - y_batch) / m
            dW2 = A1.T.dot(dZ2)
            db2 = dZ2.sum(axis=0, keepdims=True)
            dA1 = dZ2.dot(W2.T)
            dZ1 = dA1 * relu_deriv(Z1)
            dW1 = X_batch.T.dot(dZ1)
            db1 = dZ1.sum(axis=0, keepdims=True)
            # update
            W2 -= lr * dW2; b2 -= lr * db2
            W1 -= lr * dW1; b1 -= lr * db1
    
    # evaluate: argmax over class probabilities vs. one-hot truth
    A1_te = relu(X_te.dot(W1) + b1)
    A2_te = softmax(A1_te.dot(W2) + b2)
    preds = np.argmax(A2_te, axis=1)
    true  = np.argmax(y_te, axis=1)
    return np.mean(preds == true)

# --------------------------
# 7) Train & evaluate on both sets
# --------------------------
# Identical architecture and seed for both runs; only the training data
# differs, so the accuracy gap isolates the effect of the augmentation.
acc_orig = train_and_evaluate(X_train,     y_train, X_test, y_test)
acc_aug  = train_and_evaluate(X_train_aug, y_train, X_test, y_test)

# --------------------------
# 8) Print final results
# --------------------------
print(f"Original training set test accuracy:   {acc_orig*100:.2f}%")
print(f"Augmented training set test accuracy: {acc_aug*100:.2f}%")
../_images/ab06634be1953f464a25c5b9b402190ea844a6130a90800bc2abb8bdc1b16de6.png ../_images/ebcab9944a294ea01b846d5876d1ff72f8cace6be877e612e4b2ede8bb77b726.png
Original training set test accuracy:   97.22%
Augmented training set test accuracy: 96.99%
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# --------------------------
# 1) Load MNIST and normalize
# --------------------------
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist['data'], mnist['target'].astype(int)
X = X / 255.0  # scale pixels to [0, 1]

# --------------------------
# 2) Train/test split
# --------------------------
X_train, X_test, y_train_int, y_test_int = train_test_split(
    X, y, test_size=10000, random_state=42
)

# --------------------------
# 3) Encode labels as 4-bit binary vectors
# --------------------------
def int_to_bits(labels, num_bits=4):
    """Encode integer labels as MSB-first binary float vectors.

    E.g. with num_bits=4, label 9 becomes [1., 0., 0., 1.].
    """
    shifts = np.arange(num_bits)[::-1]  # e.g. [3, 2, 1, 0] → bit values 8,4,2,1
    bits = (labels.reshape(-1, 1) >> shifts) & 1
    return bits.astype(float)

def bits_to_int(bits):
    """Decode MSB-first bit rows (shape (N, k)) back to integers."""
    num_bits = bits.shape[1]
    weights = 2 ** np.arange(num_bits - 1, -1, -1)
    return bits.dot(weights)

y_train = int_to_bits(y_train_int, num_bits=4)  # (n_train, 4) float bit targets
y_test  = int_to_bits(y_test_int,  num_bits=4)  # (n_test, 4) float bit targets

# --------------------------
# 4) Define network & helpers
#    784 → 128 (ReLU) → 4 (sigmoid)
# --------------------------
def relu(Z):
    """Elementwise max(0, z)."""
    return np.maximum(0, Z)

def relu_deriv(Z):
    """ReLU derivative: 1 where the input is positive, else 0."""
    positive = Z > 0
    return positive.astype(float)

def sigmoid(Z):
    """Elementwise logistic function 1 / (1 + e^(-z))."""
    neg_exp = np.exp(-Z)
    return 1 / (1 + neg_exp)

rng = np.random.RandomState(1)
input_size, hidden_size, output_size = 784, 128, 4  # 4 sigmoid outputs = 4 bits

# He-initialized weights, zero biases
W1 = rng.randn(input_size, hidden_size) * np.sqrt(2.0/input_size)
b1 = np.zeros((1, hidden_size))
W2 = rng.randn(hidden_size, output_size) * np.sqrt(2.0/hidden_size)
b2 = np.zeros((1, output_size))

# --------------------------
# 5) Training hyperparameters
# --------------------------
lr = 0.1         # learning rate
epochs = 10      # full passes over the training set
batch_size = 64  # mini-batch size
n_train = X_train.shape[0]

# --------------------------
# 6) Training loop
# --------------------------
# Mini-batch gradient descent; each of the 4 sigmoid outputs is trained
# against one bit of the label's binary encoding.
for epoch in range(1, epochs+1):
    # shuffle
    perm = rng.permutation(n_train)
    X_shuf = X_train[perm]
    y_shuf = y_train[perm]
    
    for i in range(0, n_train, batch_size):
        X_batch = X_shuf[i:i+batch_size]
        y_batch = y_shuf[i:i+batch_size]
        
        # forward
        Z1 = X_batch.dot(W1) + b1          # (batch,128)
        A1 = relu(Z1)
        Z2 = A1.dot(W2) + b2               # (batch,4)
        A2 = sigmoid(Z2)                   # (batch,4)
        
        # backward (binary cross-entropy; sigmoid + BCE gradient is A2 - y)
        m = X_batch.shape[0]
        dZ2 = (A2 - y_batch) / m           # (batch,4)
        dW2 = A1.T.dot(dZ2)                # (128,4)
        db2 = dZ2.sum(axis=0, keepdims=True)
        
        dA1 = dZ2.dot(W2.T)                # (batch,128)
        dZ1 = dA1 * relu_deriv(Z1)
        dW1 = X_batch.T.dot(dZ1)           # (784,128)
        db1 = dZ1.sum(axis=0, keepdims=True)
        
        # update
        W2 -= lr * dW2;  b2 -= lr * db2
        W1 -= lr * dW1;  b1 -= lr * db1

# --------------------------
# 7) Evaluate on test set
# --------------------------
# forward pass over the whole test set
A1_test = relu(X_test.dot(W1) + b1)
A2_test = sigmoid(A1_test.dot(W2) + b2)  # (n_test,4)

# threshold each output at 0.5, then decode the 4-bit code to a digit
bits_pred = (A2_test >= 0.5).astype(int)
y_pred_int = bits_to_int(bits_pred)
y_true_int = y_test_int

# classification accuracy (decoded codes 10–15 can never match a label)
accuracy = np.mean(y_pred_int == y_true_int)
print(f"Test accuracy (4-bit encoding): {accuracy*100:.2f}%")
Test accuracy (4-bit encoding): 94.49%